#!/bin/bash
# Usage: <script> INDEX
#
# Reads line (INDEX + 1) of scripts/fineweb-edu.txt, uses that line as the raw
# dataset path, and hands it to scripts/search.sh followed by
# scripts/merged_jsonl.sh. Prints "INDEX.FINISHED" after both commands have run.

# Reject a missing or non-integer index: inside $(( )) an empty or non-numeric
# value evaluates to 0, which would silently select line 1 of the list.
if [[ ! $1 =~ ^-?[0-9]+$ ]]; then
  echo "usage: $0 INDEX   (INDEX must be an integer)" >&2
  exit 2
fi

line_number=$(($1 + 1))
if ((line_number < 1)); then
  echo "error: index '$1' maps to invalid line number '${line_number}'" >&2
  exit 2
fi

SCRIPT_PATH=/share/projset/dsir7
DATASET_LIST=/share/projset/dsir7/scripts/fineweb-edu.txt

result=$(sed -n "${line_number}p" "$DATASET_LIST")
# An empty result (missing list file, line past the end, blank line) must stop
# here; otherwise the dataset argument would be empty in the calls below.
if [[ -z $result ]]; then
  echo "error: no dataset path on line ${line_number} of ${DATASET_LIST}" >&2
  exit 1
fi

# The helper scripts are invoked by relative path, so the cd has to succeed.
cd "$SCRIPT_PATH" || {
  echo "error: cannot cd to ${SCRIPT_PATH}" >&2
  exit 1
}

CONFIG_FILE=/share/projset/dsir7/indexes/qwen_tokenizer_3_300000/feat_config.json
RAW_DATASETS=$result
# Dataset file name without its directory and without the .jsonl suffix.
name=$(basename -- "$result" .jsonl)
TARGET_DATASETS=/share/projset/dsir7/data/llm-eval-v2-merge.jsonl
NUM_TO_SAMPLE=4200000
MIN_EXAMPLE_LENGTH=0
MERGED_DIR=/share/projset/dsir7/indexes/qwen_tokenizer_3_300000/"$name"_llm-eval-v2-merge_0.0_0/resampled
OUTFILE=/share/projset/dsir7/indexes/qwen_tokenizer_3_300000/"$name"-llmevalv2.jsonl
# echo "$MERGED_DIR"
# echo "$OUTFILE"

# NOTE(review): the exit status of search.sh is not checked, matching the
# previous behaviour; confirm whether a failure should skip the merge step.
bash scripts/search.sh "$CONFIG_FILE" "$RAW_DATASETS" "$TARGET_DATASETS" "$NUM_TO_SAMPLE" "$MIN_EXAMPLE_LENGTH"
bash scripts/merged_jsonl.sh "$MERGED_DIR" "$OUTFILE"
echo "$1.FINISHED"

#!/bin/bash
# Usage: <script> INDEX
#
# Reads line (INDEX + 49) of scripts/fineweb-edu.txt, uses that line as the raw
# dataset path, and hands it to scripts/search.sh followed by
# scripts/merged_jsonl.sh. Prints "INDEX.FINISHED2" after both commands have
# run.

# Reject a missing or non-integer index: inside $(( )) an empty or non-numeric
# value evaluates to 0, which would silently select line 49 of the list.
if [[ ! $1 =~ ^-?[0-9]+$ ]]; then
  echo "usage: $0 INDEX   (INDEX must be an integer)" >&2
  exit 2
fi

line_number=$(($1 + 49))
if ((line_number < 1)); then
  echo "error: index '$1' maps to invalid line number '${line_number}'" >&2
  exit 2
fi

SCRIPT_PATH=/share/projset/dsir7
DATASET_LIST=/share/projset/dsir7/scripts/fineweb-edu.txt

result=$(sed -n "${line_number}p" "$DATASET_LIST")
# An empty result (missing list file, line past the end, blank line) must stop
# here; otherwise the dataset argument would be empty in the calls below.
if [[ -z $result ]]; then
  echo "error: no dataset path on line ${line_number} of ${DATASET_LIST}" >&2
  exit 1
fi

# The helper scripts are invoked by relative path, so the cd has to succeed.
cd "$SCRIPT_PATH" || {
  echo "error: cannot cd to ${SCRIPT_PATH}" >&2
  exit 1
}

CONFIG_FILE=/share/projset/dsir7/indexes/qwen_tokenizer_3_300000/feat_config.json
RAW_DATASETS=$result
# Dataset file name without its directory and without the .jsonl suffix.
name=$(basename -- "$result" .jsonl)
TARGET_DATASETS=/share/projset/dsir7/data/llm-eval-v2-merge.jsonl
NUM_TO_SAMPLE=4200000
MIN_EXAMPLE_LENGTH=0
MERGED_DIR=/share/projset/dsir7/indexes/qwen_tokenizer_3_300000/"$name"_llm-eval-v2-merge_0.0_0/resampled
OUTFILE=/share/projset/dsir7/indexes/qwen_tokenizer_3_300000/"$name"-llmevalv2.jsonl
# echo "$MERGED_DIR"
# echo "$OUTFILE"

# NOTE(review): the exit status of search.sh is not checked, matching the
# previous behaviour; confirm whether a failure should skip the merge step.
bash scripts/search.sh "$CONFIG_FILE" "$RAW_DATASETS" "$TARGET_DATASETS" "$NUM_TO_SAMPLE" "$MIN_EXAMPLE_LENGTH"
bash scripts/merged_jsonl.sh "$MERGED_DIR" "$OUTFILE"
echo "$1.FINISHED2"